Pipeline 4: OLS influence diagnosis

In [1]:
from pathlib import Path
import logging
import os
import functools
import re
import pprint as pp
import datetime
import warnings

import h5py
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns


import matplotlib as mpl
RC_PARAMS = dict(mpl.rcParams)
from matplotlib import pyplot as plt

import fitgrid
import fitgrid.utils as fgutil

from udck19_filenames import (
    EEG_EXPT_FILES,
    EEG_EPOCHS_DIR,
    EEG_MODELING_DIR, 
    PREPOCHS_TRMD_EEG_F,
    PREPOCHS_TRMD_EEG_COOKSD_F
)

from udck19_utils import (
    get_udck19_logger,
    check_ENV,
    N_EPOCH_SAMPS,  # epoch length in samples
    N_TRMD_EEG_EPOCHS,  # number of epochs after EEG screening in pipeline_1
    EEG_SCREEN_COL,  # HDF5 dataset key
    check_epochs_shape, 
    EEG_EXPT_SPECS, 
    EEG_26_STREAMS,
    RHS_VARS,
    LMER_MODELS,
    standardize,
    fit_lmer_formulas,
    read_fg_summaries_hdf,
    plotchans, 
    MPL_32_CHAN, 
    MPL_MIDLINE,
    udck19_figsave,
    panel_from_idx,
    FIG_TAG_SPECS,
)

# enforce active conda env
check_ENV()

# logging config
__file__ = 'udck19_pipeline_4.ipynb'
logging.shutdown()
LOGGER = get_udck19_logger(__file__)

pipeline_start = datetime.datetime.now()

LOGGER.info(f"""
udck19 Supplementary Materials 4
CONDA_DEFAULT_ENV: {os.environ['CONDA_DEFAULT_ENV']}
pandas: {pd.__version__} 
fitgrid: {fitgrid.__version__}
Start {pipeline_start.strftime("%d.%b %Y %H:%M:%S")}
""")
/home/turbach/.conda/envs/udck19_pnas_121819/lib/python3.6/site-packages/ipykernel_launcher.py:18: MatplotlibDeprecationWarning: 
The examples.directory rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2. In the future, examples will be found relative to the 'datapath' directory.
udck19_pipeline_4.ipynb:INFO:
udck19 Supplementary Materials 4
CONDA_DEFAULT_ENV: udck19_pnas_121819
pandas: 0.25.2 
fitgrid: 0.4.8
Start 19.Dec 2019 12:54:16

Notebook globals

In [2]:
# True to prerun LMER refitting on a subset of times and channels
PRERUN = False  # True


FIG_PREFIX = 'udck19_pipeline_4 Fig'            
FIG_COUNT = 1

# filename for the lmer fit summaries after Cooks D trimming
LMER_ACZ_RANEF_COOKSD_F = "lmer_acz_ranef_cooksD.h5"

# highlight ... alpha=0 to disable
n4_highlight = {
    'xmin': 300,
    'xmax': 500,
    'color': 'magenta',
    'alpha': 0.2
}

Ordinary least squares Cook's D

In a data set this large, no individual epoch is likely to be influential in the sense of changing parameter estimates.

Inspection of relatively influential may be informative (Fox 2008 p. 254)

Fit this OLS model to identify relatively influential single data points regardless of experiment, subject, or item.

EEG ~ 1 + article_cloze_z
In [3]:
%%time

# ready the eeg screened single trial epochs for fitgrid
prepochs_trmd_eeg_df = (
    pd.read_hdf(EEG_EPOCHS_DIR / PREPOCHS_TRMD_EEG_F).reset_index().set_index(['Epoch_idx', 'Time'])
)

# sanity check single trial epochs as screened in pipeline_1
assert (N_EPOCH_SAMPS, N_TRMD_EEG_EPOCHS) == check_epochs_shape(prepochs_trmd_eeg_df)
assert all([val == 'accept' for val in prepochs_trmd_eeg_df[EEG_SCREEN_COL]])

# standardize cloze 
prepochs_trmd_eeg_df, means_sd = standardize(
    prepochs_trmd_eeg_df, ['article_cloze', 'ART_noun_cloze', 'NA_noun_cloze']
)

prepochs_trmd_eeg_fg = fitgrid.epochs_from_dataframe(
    prepochs_trmd_eeg_df.loc[:, RHS_VARS + EEG_26_STREAMS],
    epoch_id='Epoch_idx',
    time='Time',
    channels=EEG_26_STREAMS
)


# fetch and export cooksD values
lm_acz_cooksD_label = "lm_acz_cooksD"

lm_acz_cooksD, _ = fgutil.lm.get_diagnostic(
        fitgrid.lm(
            prepochs_trmd_eeg_fg,
            RHS="article_cloze_z",
            LHS=EEG_26_STREAMS,
            parallel=True,
            n_cores=32
        ),
        "cooks_distance"
)

# ~1GB ... too big for deposition, rebuild locally
# cooksD_f = EEG_MODELING_DIR / (lm_acz_cooksD_label + ".h5")
# lm_acz_cooksD.to_hdf(cooksD_f, key=lm_acz_cooksD_label, mode='w')
100%|██████████| 375/375 [00:19<00:00, 19.01it/s]
CPU times: user 6min 35s, sys: 2min 1s, total: 8min 36s
Wall time: 3min 50s

Define extreme values Cooks D

Fox 2008 p. 255

Cook's Distance criterion for examination: $D_{i} > \frac{4}{n - k - 1}$

In [4]:
n = len(lm_acz_cooksD.index.unique('Epoch_idx'))
k = 2  # slope and intercept in the OLS model
lm_acz_cooksD_extreme_lb = 4 / (n - k - 1)
LOGGER.info(f"""
Extreme Cook's D criterion: {lm_acz_cooksD_extreme_lb:0.5f} = 4 / (n - k - 1) 
where
  n = {n} number of single trial epochs 
  k = {k} for OLS intercept, slope
""")
udck19_pipeline_4.ipynb:INFO:
Extreme Cook's D criterion: 0.00033 = 4 / (n - k - 1) 
where
  n = 12043 number of single trial epochs 
  k = 2 for OLS intercept, slope

In [5]:
# mask for extreme Cook's D time points by epoch and channels 
is_extreme_cooksD = (lm_acz_cooksD > lm_acz_cooksD_extreme_lb)

# Count number of extreme Cook's D time points by epoch and channels ... slow
lm_acz_cooksD_extreme_counts = (
    is_extreme_cooksD
).astype('u1').groupby('Epoch_idx').sum(axis=0)

Lookup extreme Cook's D values in the epochs

In [6]:
# maximum allowed extreme Cooks D data points per epoch on any channel
max_cooksD_extreme_proportion = 0.15

# how many data points per epoch can be above criterion? 
max_cooksD_extreme_n = int(np.floor(max_cooksD_extreme_proportion * N_EPOCH_SAMPS)) 
LOGGER.info(
    "Maximum allowed extreme Cook's D data points per epoch on any channel:"
    f" {100 * max_cooksD_extreme_proportion}% of {N_EPOCH_SAMPS} timepoints = {max_cooksD_extreme_n}"
)

# Epoch ids that pass the Cook's D screening ... all these epochs
# have no more than the allowed proportion of Cook's D extreme values on 
# all channels
trmd_cooksD_epoch_ids = lm_acz_cooksD_extreme_counts.where(
        lm_acz_cooksD_extreme_counts < max_cooksD_extreme_n
    ).dropna(how='any').index.unique('Epoch_idx')

# sanity check that worst case *retained* epoch_ids are still 
# *below* the Cook's D exclusion threshold
assert all(
    lm_acz_cooksD_extreme_counts.query("Epoch_idx in @trmd_cooksD_epoch_ids").max() 
    < max_cooksD_extreme_n
)

lm_acz_cooksD_extreme_counts_mean = lm_acz_cooksD_extreme_counts.query(
    "Epoch_idx in @trmd_cooksD_epoch_ids"
).mean() / N_EPOCH_SAMPS

LOGGER.info(f"""

Average proportion of extreme Cook's D samples per epoch:
overall: {lm_acz_cooksD_extreme_counts_mean.mean():.3f}
by channel:
{lm_acz_cooksD_extreme_counts_mean}
""")
        
udck19_pipeline_4.ipynb:INFO:Maximum allowed extreme Cook's D data points per epoch on any channel: 15.0% of 375 timepoints = 56
udck19_pipeline_4.ipynb:INFO:

Average proportion of extreme Cook's D samples per epoch:
overall: 0.019
by channel:
diagnostic      channel
cooks_distance  RLPf       0.017470
                RMPf       0.020071
                RDFr       0.020374
                RLFr       0.018801
                RMFr       0.020358
                RMCe       0.018933
                RDCe       0.020256
                RDPa       0.017756
                RLTe       0.019961
                RLOc       0.018084
                RMOc       0.016744
                MiPf       0.017003
                MiCe       0.020021
                MiPa       0.018142
                MiOc       0.017489
                LLPf       0.016404
                LMPf       0.020568
                LDFr       0.019883
                LLFr       0.018479
                LMFr       0.019957
                LMCe       0.018549
                LDCe       0.019450
                LDPa       0.017721
                LLTe       0.019563
                LLOc       0.018495
                LMOc       0.016139
dtype: float64

Distribution of Cook's D values

Solid red indicates quantile, dotted red indicates maximum value

In [7]:
def distplot_cooksD(values, extreme_lb, ax_kws={}):

    f, axs = plt.subplots(
        len(values.columns), 2, 
        figsize=(12, 1 * len(values.columns)),
        sharey='col',
    )
    for axi, (diag, chan) in enumerate(values.columns):
        #print(col)
        cooks_D = values[(diag, chan)]

        # Cook's D 
        ax = axs[axi, 0]
        sns.distplot(cooks_D, kde=False, bins=50, ax=ax)
        ax.axvline(x=extreme_lb, color='red', lw=2)
        ax.axvline(x=cooks_D.max(), color='red', lw=1)
        ax.set(xlabel="", ylabel=f"{chan}")
        if axi == 0:
            ax.set(title='CooksD')
        
        # log10 Cook's D
        trans_fun = np.log10
        ax = axs[axi, 1]
        sns.distplot(
            trans_fun(cooks_D), kde=False, bins=50, ax=ax
        )
        ax.axvline(x=trans_fun(extreme_lb), color='red', lw=2)
        ax.axvline(
            x=trans_fun(cooks_D.max()),
            color='red',
            lw=1
        )
        if axi == 0:
            ax.set(title='Log10 CooksD')
        
    return f, axs
In [8]:
LOGGER.info("Note: the left tail of Log10 plots is truncated, Cook's D values < 10e-10 are not shown")

f, axs = distplot_cooksD(lm_acz_cooksD, lm_acz_cooksD_extreme_lb)

axs = f.get_axes()
for axi, ax in enumerate(axs):
    if axi % 2 == 0:
        ax.set(xlim=(0, 0.02))
    else:
        ax.set(xlim=(-10, -1.0)) 

f.set_size_inches(12, 48)
f.text(0.5, 1.0, r"Cook's D: all epochs $\times$ channels", transform=f.transFigure)

figname_txt = f"{FIG_PREFIX} {FIG_COUNT}"
f.suptitle(figname_txt, **FIG_TAG_SPECS)
f.tight_layout()

FIG_COUNT = udck19_figsave(f, f"{figname_txt}_CooksD_x_channel", FIG_COUNT)
udck19_pipeline_4.ipynb:INFO:Note: the left tail of Log10 plots is truncated, Cook's D values < 10e-10 are not shown

Extreme Cook's D counts: time x channel

In [9]:
# each Time sum across Epochs for each channel
cooksD_counts = is_extreme_cooksD.groupby('Time').sum(axis=0)

# dump count stats as a proportion of the number of epochs
with pd.option_context('display.max_rows', None):
    LOGGER.info(f"""
Extreme Cook's D proportions of epochs x channel
{cooksD_counts.div(N_TRMD_EEG_EPOCHS).describe().unstack()}
""")

# plot
f, ax = plt.subplots(1, 1, figsize=(12,8))

ax.set_title(f"Number of extreme Cook's D EEG samples out of {N_TRMD_EEG_EPOCHS} after EEG artifact trimming")

cooksD_counts.reset_index('Time').plot(
    x='Time', ax=ax, marker='.', lw=0, alpha=.3
)

ax.axvspan(**n4_highlight)

ax.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
ax.set(ylabel="Count")
y1, y2 = ax.get_ylim()

fig_tag = f"{FIG_PREFIX} {FIG_COUNT} CooksD counts x time"
f.suptitle(fig_tag, **FIG_TAG_SPECS)
FIG_COUNT = udck19_figsave(f, fig_tag, FIG_COUNT)
udck19_pipeline_4.ipynb:INFO:
Extreme Cook's D proportions of epochs x channel
diagnostic      channel       
cooks_distance  RLPf     count    375.000000
                         mean       0.049173
                         std        0.001550
                         min        0.044839
                         25%        0.048161
                         50%        0.049074
                         75%        0.050237
                         max        0.053309
                RMPf     count    375.000000
                         mean       0.050255
                         std        0.001348
                         min        0.046334
                         25%        0.049406
                         50%        0.050154
                         75%        0.051067
                         max        0.054970
                RDFr     count    375.000000
                         mean       0.050223
                         std        0.001434
                         min        0.046002
                         25%        0.049157
                         50%        0.050154
                         75%        0.051233
                         max        0.054222
                RLFr     count    375.000000
                         mean       0.048401
                         std        0.002055
                         min        0.043179
                         25%        0.046749
                         50%        0.048493
                         75%        0.049988
                         max        0.053973
                RMFr     count    375.000000
                         mean       0.050901
                         std        0.001282
                         min        0.048078
                         25%        0.049988
                         50%        0.050901
                         75%        0.051856
                         max        0.054721
                RMCe     count    375.000000
                         mean       0.051231
                         std        0.001348
                         min        0.047496
                         25%        0.050320
                         50%        0.051233
                         75%        0.052146
                         max        0.056464
                RDCe     count    375.000000
                         mean       0.050815
                         std        0.001342
                         min        0.047164
                         25%        0.049905
                         50%        0.050735
                         75%        0.051690
                         max        0.054970
                RDPa     count    375.000000
                         mean       0.051160
                         std        0.001311
                         min        0.047912
                         25%        0.050237
                         50%        0.051150
                         75%        0.052063
                         max        0.055136
                RLTe     count    375.000000
                         mean       0.050195
                         std        0.001645
                         min        0.045836
                         25%        0.049074
                         50%        0.050320
                         75%        0.051233
                         max        0.054388
                RLOc     count    375.000000
                         mean       0.051138
                         std        0.001467
                         min        0.046915
                         25%        0.050071
                         50%        0.051067
                         75%        0.052230
                         max        0.055136
                RMOc     count    375.000000
                         mean       0.051827
                         std        0.001433
                         min        0.048161
                         25%        0.050901
                         50%        0.051731
                         75%        0.052728
                         max        0.055468
                MiPf     count    375.000000
                         mean       0.049424
                         std        0.001449
                         min        0.045753
                         25%        0.048410
                         50%        0.049489
                         75%        0.050403
                         max        0.054305
                MiCe     count    375.000000
                         mean       0.050850
                         std        0.001274
                         min        0.047330
                         25%        0.049946
                         50%        0.050818
                         75%        0.051773
                         max        0.054804
                MiPa     count    375.000000
                         mean       0.051213
                         std        0.001405
                         min        0.047164
                         25%        0.050403
                         50%        0.051233
                         75%        0.052188
                         max        0.055551
                MiOc     count    375.000000
                         mean       0.052305
                         std        0.001510
                         min        0.047496
                         25%        0.051316
                         50%        0.052396
                         75%        0.053350
                         max        0.056464
                LLPf     count    375.000000
                         mean       0.049986
                         std        0.001381
                         min        0.045005
                         25%        0.049074
                         50%        0.049988
                         75%        0.050942
                         max        0.054056
                LMPf     count    375.000000
                         mean       0.050123
                         std        0.001365
                         min        0.046832
                         25%        0.049282
                         50%        0.050154
                         75%        0.050984
                         max        0.055136
                LDFr     count    375.000000
                         mean       0.050489
                         std        0.001470
                         min        0.045504
                         25%        0.049406
                         50%        0.050486
                         75%        0.051399
                         max        0.054804
                LLFr     count    375.000000
                         mean       0.048488
                         std        0.001622
                         min        0.045088
                         25%        0.047247
                         50%        0.048327
                         75%        0.049489
                         max        0.053641
                LMFr     count    375.000000
                         mean       0.050538
                         std        0.001482
                         min        0.045670
                         25%        0.049572
                         50%        0.050486
                         75%        0.051482
                         max        0.055385
                LMCe     count    375.000000
                         mean       0.051114
                         std        0.001401
                         min        0.046749
                         25%        0.050237
                         50%        0.051233
                         75%        0.052063
                         max        0.054970
                LDCe     count    375.000000
                         mean       0.050880
                         std        0.001474
                         min        0.046832
                         25%        0.049905
                         50%        0.050901
                         75%        0.051980
                         max        0.054970
                LDPa     count    375.000000
                         mean       0.051470
                         std        0.001390
                         min        0.047496
                         25%        0.050486
                         50%        0.051482
                         75%        0.052479
                         max        0.054970
                LLTe     count    375.000000
                         mean       0.049368
                         std        0.001521
                         min        0.045587
                         25%        0.048244
                         50%        0.049240
                         75%        0.050320
                         max        0.054139
                LLOc     count    375.000000
                         mean       0.051071
                         std        0.001441
                         min        0.047580
                         25%        0.050071
                         50%        0.050984
                         75%        0.052063
                         max        0.055634
                LMOc     count    375.000000
                         mean       0.051390
                         std        0.001463
                         min        0.047580
                         25%        0.050486
                         50%        0.051399
                         75%        0.052396
                         max        0.055385
dtype: float64

Cook's D extreme values: epoch x time x channel

In [10]:
%%time 

msg = (f"""
extreme Cooks D trimming bound: {lm_acz_cooksD_extreme_lb:.5f}
""")
LOGGER.info(msg)

# Cook's D extreme value raster: time x epoch
f_raster, axs_raster = plt.subplots(len(EEG_26_STREAMS), 1, figsize=(12, 8*len(EEG_26_STREAMS)))

# Cook's D extreme value raster: time x epoch
f_raster2, axs_raster2 = plt.subplots(len(EEG_26_STREAMS), 1, figsize=(12, 8*len(EEG_26_STREAMS)))

for axi, chan in enumerate(EEG_26_STREAMS):
    # all epochs
    ax = axs_raster[axi]
    ax.set_title(f"Epochs before Cook's D trimming {chan}")
    sns.heatmap(
            # raster data = 1 at and above extreme D cutoff, 0 elsewhere
            data=pd.cut(
                lm_acz_cooksD.loc[:, pd.IndexSlice['cooks_distance', chan]],
                [0, lm_acz_cooksD_extreme_lb, lm_acz_cooksD.max().max()], 
                labels=False).unstack(-1).T,
            ax = ax,
            cmap="Greys",
            cbar=False,
    )
 
    ax = axs_raster2[axi]
    ax.set_title(f"Epochs after Cook's D trimming {chan}")
    sns.heatmap(
            # raster data = 1 at and above extreme D cutoff, 0 elsewhere
            data=pd.cut(
                lm_acz_cooksD.query("Epoch_idx in @trmd_cooksD_epoch_ids")
                .loc[:, pd.IndexSlice['cooks_distance', chan]],
                [0, lm_acz_cooksD_extreme_lb, lm_acz_cooksD.max().max()], 
                labels=False).unstack(-1).T,
            ax = ax,
            cmap="Greys",
            cbar=False,
    )
    
    # plotting is slow, skip rest if prerunning
    if PRERUN:
        break

fig_tag = f"{FIG_PREFIX} {FIG_COUNT}"
f_raster.suptitle(fig_tag, **FIG_TAG_SPECS)
f_raster.tight_layout()
FIG_COUNT = udck19_figsave(
    f_raster,
    f"{fig_tag}_epochs_before_CooksD",
    FIG_COUNT,
    formats=['png']
)

fig_tag = f"{FIG_PREFIX} {FIG_COUNT}"
f_raster2.suptitle(f"{FIG_PREFIX} {FIG_COUNT}", **FIG_TAG_SPECS)
f_raster2.tight_layout()
FIG_COUNT = udck19_figsave(
    f_raster2,
    f"{fig_tag}_epochs_after_CooksD",
    FIG_COUNT,
    formats=['png']
)
udck19_pipeline_4.ipynb:INFO:
extreme Cooks D trimming bound: 0.00033

CPU times: user 39min 5s, sys: 2min 53s, total: 41min 58s
Wall time: 20min 3s

Epochs with more than the allowed extreme Cook's D

The three ranges indicate dataset eeg_1 (10000s) eeg_2 (20000s) and eeg_3 (30000s)

In [11]:
bad_cooksD_epoch_ids = np.array(
    list(set(prepochs_trmd_eeg_df.index.unique('Epoch_idx')).difference(set(trmd_cooksD_epoch_ids)))
)
assert len(trmd_cooksD_epoch_ids) + len(bad_cooksD_epoch_ids) == N_TRMD_EEG_EPOCHS
f, ax = plt.subplots(1, 1, figsize=(16, 4))

# retained
sns.scatterplot(
    x=trmd_cooksD_epoch_ids, 
    y= np.random.random(len(trmd_cooksD_epoch_ids)), 
    ax=ax,
    marker='.',
)

# excluded
sns.scatterplot(
    x=bad_cooksD_epoch_ids, 
    y= -np.random.random(len(bad_cooksD_epoch_ids)), 
    ax=ax,
    marker='.', color='red',
)


ax.set(title="Epochs trimmed for excessive numbers of extreme Cook's D by data set")
_ = ax.set(xlabel='Epoch index', ylabel=('random jitter for visualization'))

figname_txt = f"{FIG_PREFIX} {FIG_COUNT}"
f.suptitle(figname_txt, **FIG_TAG_SPECS)
f.tight_layout()

FIG_COUNT = udck19_figsave(f, f"{figname_txt}_CooksD_trimming_x_epoch", FIG_COUNT)

Exclude epochs by extreme Cook's D criterion, re-standardize cloze

In [12]:
n_trmd_cooksD_epochs = len(trmd_cooksD_epoch_ids)
LOGGER.info(f"""
number of epochs after Cook's D screening / before = 
{n_trmd_cooksD_epochs} / {N_TRMD_EEG_EPOCHS} = {(n_trmd_cooksD_epochs / N_TRMD_EEG_EPOCHS):3.3f}
""")
udck19_pipeline_4.ipynb:INFO:
number of epochs after Cook's D screening / before = 
7346 / 12043 = 0.610

In [13]:
# can't modify the index and columns of a view so make a fresh copy
prepochs_trmd_eeg_cooksD_df = prepochs_trmd_eeg_df.query(
    'Epoch_idx in @trmd_cooksD_epoch_ids'
).copy()

# clobber vestigal index rows
prepochs_trmd_eeg_cooksD_df.index = (
    prepochs_trmd_eeg_cooksD_df.index.remove_unused_levels()
)
prepochs_trmd_eeg_cooksD_df.sort_index(inplace=True)
assert all(trmd_cooksD_epoch_ids == prepochs_trmd_eeg_cooksD_df.index.unique('Epoch_idx'))


# re-standardize cloze for the remaining epochs
cols_to_z = ['article_cloze', 'ART_noun_cloze', 'NA_noun_cloze']
for col in cols_to_z:
    del prepochs_trmd_eeg_cooksD_df[col+"_z"]

prepochs_trmd_eeg_cooksD_df, means_sds = standardize(
    prepochs_trmd_eeg_cooksD_df, col_names=cols_to_z
)
LOGGER.info(f"""{cols_to_z}""")

# log
N_TRMD_EEG_COOKSD_SAMP, N_TRMD_EEG_COOKSD_EPOCHS = check_epochs_shape(prepochs_trmd_eeg_cooksD_df)
LOGGER.info(
    "Prepared epochs after dropping EEG artifacts and Cooks D outliers:"
    f" timestamps: {N_TRMD_EEG_COOKSD_SAMP}, epochs {N_TRMD_EEG_COOKSD_EPOCHS}"
)
LOGGER.info(f"index names: {prepochs_trmd_eeg_cooksD_df.index.names}")
LOGGER.info(f"columns: {prepochs_trmd_eeg_cooksD_df.columns}")

# save 
h5_key = "eeg_screen_cooksD"
LOGGER.info(f"""
Writing Cooks D screened EEG data epochs
HDF5: {PREPOCHS_TRMD_EEG_COOKSD_F}, key={h5_key}
""")
prepochs_trmd_eeg_cooksD_df.to_hdf(PREPOCHS_TRMD_EEG_COOKSD_F, key="eeg_screen_cooksD", mode='w')
udck19_pipeline_4.ipynb:INFO:['article_cloze', 'ART_noun_cloze', 'NA_noun_cloze']
udck19_pipeline_4.ipynb:INFO:Prepared epochs after dropping EEG artifacts and Cooks D outliers: timestamps: 375, epochs 7346
udck19_pipeline_4.ipynb:INFO:index names: ['Epoch_idx', 'Time']
udck19_pipeline_4.ipynb:INFO:columns: Index(['expt', 'sub_id', 'item_id', 'h5_dataset', 'dataset_index',
       'event_code', 'regex_match', 'regex_anchor', 'garv_reject', 'article',
       'adjective', 'noun', 'article_cloze', 'ART_noun_cloze', 'NA_noun_cloze',
       'lle', 'lhz', 'MiPf', 'LLPf', 'RLPf', 'LMPf', 'RMPf', 'LDFr', 'RDFr',
       'LLFr', 'RLFr', 'LMFr', 'RMFr', 'LMCe', 'RMCe', 'MiCe', 'MiPa', 'LDCe',
       'RDCe', 'LDPa', 'RDPa', 'LMOc', 'RMOc', 'LLTe', 'RLTe', 'LLOc', 'RLOc',
       'MiOc', 'A2', 'HEOG', 'rle', 'rhz', 'article_item_id', 'ptp_excursion',
       'blocked', 'garv_blink', 'garv_screen', 'eeg_screen', 'article_cloze_z',
       'ART_noun_cloze_z', 'NA_noun_cloze_z'],
      dtype='object')
udck19_pipeline_4.ipynb:INFO:
Writing Cooks D screened EEG data epochs
HDF5: /mnt/cube/home/turbach/papers/udck19/analysis/data/epochs/prepochs_trimd_eeg_cooksD.h5, key=eeg_screen_cooksD

/home/turbach/.conda/envs/udck19_pnas_121819/lib/python3.6/site-packages/pandas/core/generic.py:2530: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->['expt', 'sub_id', 'item_id', 'h5_dataset', 'article', 'adjective', 'noun', 'article_item_id', 'garv_screen', 'eeg_screen']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)

Refit KIM and KIP LMER models after trimming by extreme Cook's D

Candidate models from pipeline_3

    KIM: article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (1 | article_item_id)
    KIP: article_cloze_z + (1 | expt) + (1 | sub_id) + (1 | article_item_id)
In [14]:
lmer_acz_comps = {
    "KIM": [
        'article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (1 | article_item_id)',
        '(1 | expt) + (article_cloze_z | sub_id) + (1 | article_item_id)',
    ],
    
    "KIP": [
        'article_cloze_z + (1 | expt) + (1 | sub_id) + (1 | article_item_id)',
        '(1 | expt) + (1 | sub_id) + (1 | article_item_id)',
    ]
}
In [15]:
# for debugging only 
# prepochs_trmd_eeg_cooksD_df = pd.read_hdf(PREPOCHS_TRMD_EEG_COOKSD_F, key="eeg_screen_cooksD")

# load into fitgrid
if PRERUN:
    step = 5
    time_slice = pd.IndexSlice[:, slice(-200, 600, step)]
    LMER_CHANNELS = LMER_CHANNELS = ['MiPf', 'MiCe', 'MiPa', 'MiOc'] # EEG_26_STREAMS
    pfx = f'step{step}_chans{len(LMER_CHANNELS)}_'
    modl_path = EEG_MODELING_DIR / "prerun"
else:
    time_slice = pd.IndexSlice[:, :]
    LMER_CHANNELS = EEG_26_STREAMS
    pfx = ""
    modl_path = EEG_MODELING_DIR

prepochs_trmd_eeg_cooksD_fg = fitgrid.epochs_from_dataframe(
    prepochs_trmd_eeg_cooksD_df
    .loc[time_slice, RHS_VARS + LMER_CHANNELS],
    epoch_id='Epoch_idx',
    time='Time',
    channels=LMER_CHANNELS
)

assert modl_path.exists()
In [16]:
# model formulae to fit
models = [m for ms in lmer_acz_comps.values() for m in ms]

# save as
lmer_acz_ranef_cooksD_f = modl_path / (pfx + LMER_ACZ_RANEF_COOKSD_F)

# set summary fitter parameters
lmer_fitter = functools.partial(
    fgutil.summary.summarize,
    modeler='lmer', 
    LHS=LMER_CHANNELS,
    parallel=True, 
    n_cores=32,
    REML=False
)


with warnings.catch_warnings():
    warnings.simplefilter("ignore") 

    fit_lmer_formulas(
        prepochs_trmd_eeg_cooksD_fg,
        lmer_fitter, 
        models,
        lmer_acz_ranef_cooksD_f,
        LOGGER,
    )
    
udck19_pipeline_4.ipynb:INFO:
    random effects structures
    file: /mnt/cube/home/turbach/papers/udck19/analysis/measures/modeling/lmer_acz_ranef_cooksD.h5
    ['article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (1 | '
 'article_item_id)',
 '(1 | expt) + (article_cloze_z | sub_id) + (1 | article_item_id)',
 'article_cloze_z + (1 | expt) + (1 | sub_id) + (1 | article_item_id)',
 '(1 | expt) + (1 | sub_id) + (1 | article_item_id)']
    
udck19_pipeline_4.ipynb:INFO:fitting article_cloze_z + (1 | expt) + (article_cloze_z | sub_id) + (1 | article_item_id)
19.Dec 2019 13:24:22
100%|██████████| 375/375 [00:35<00:00, 10.56it/s]
udck19_pipeline_4.ipynb:INFO:Elapsed time: 0:26:10.883746

udck19_pipeline_4.ipynb:INFO:fitting (1 | expt) + (article_cloze_z | sub_id) + (1 | article_item_id)
19.Dec 2019 13:50:33
100%|██████████| 375/375 [00:32<00:00, 11.40it/s]
udck19_pipeline_4.ipynb:INFO:Elapsed time: 0:26:26.082880

udck19_pipeline_4.ipynb:INFO:fitting article_cloze_z + (1 | expt) + (1 | sub_id) + (1 | article_item_id)
19.Dec 2019 14:16:59
100%|██████████| 375/375 [00:34<00:00, 10.88it/s]
udck19_pipeline_4.ipynb:INFO:Elapsed time: 0:12:25.169995

udck19_pipeline_4.ipynb:INFO:fitting (1 | expt) + (1 | sub_id) + (1 | article_item_id)
19.Dec 2019 14:29:24
100%|██████████| 375/375 [00:34<00:00, 10.81it/s]
udck19_pipeline_4.ipynb:INFO:Elapsed time: 0:12:04.355574

In [17]:
for x0, x1 in [(-1500, 1500), (-200, 600)]:
    for proc, comp in lmer_acz_comps.items():
 
        f, axs = fgutil.summary.plot_AICmin_deltas(
            read_fg_summaries_hdf(lmer_acz_ranef_cooksD_f, comp).query("Time >= @x0 and Time <= @x1")
        )

        f.set_size_inches(12, 8)
        fig_tag = f"{FIG_PREFIX} {FIG_COUNT} {proc} {x0} {x1} AIC CooksD trimmed"
        f.suptitle(fig_tag, **FIG_TAG_SPECS)
 
        n_rows, n_cols = axs.shape
        for row in range(n_rows):
            ax = axs[row, 0]
            
            # highlight
            for axj in [0,1]:
                axs[row, axj].axvspan(**n4_highlight)
           
            # panel label    
            ax.text(
                x=-0.1,
                y=1.0,
                s=f"{panel_from_idx(row)})", 
                horizontalalignment='right',
                fontsize='x-large',
                fontweight='bold',
                transform=ax.transAxes
            )

            ax.set(ylim=(0,25))
            if ax.get_legend():
                ax.get_legend().remove()
            if row == n_rows - 1:
                ax.legend(loc='upper left', bbox_to_anchor=(0, -0.1), ncol=4)

        FIG_COUNT = udck19_figsave(f, fig_tag, FIG_COUNT)
        

Standardized change in model parameters $\Delta \hat{\beta_{j}}$ for Cook's D trimming

The impact of excluding the epochs flagged for the proportion of extreme Cook's D is quantified as the change in the estimates scaled by the standard error of the estimate for the trimmed set.

$\Delta \hat{\beta_{j}} = \frac{\hat{\beta}_{j} - \hat{\beta}_{j(trimmed)}}{SE(\hat{\beta}_{j(trimmed)})}$

In [18]:
cooksD_KIM_KIP = {'KIM': {}, 'KIP': {}}
for key, models in lmer_acz_comps.items():
    full_model = models[0] # first of pair is full, second reduced

    qstr = "model == @full_model and key in ['Estimate', 'SE']"
    
    # full dataset from pipeline_2 initial LMER fitting after eeg artifact screening
    lmer_acz_ranefs_full = read_fg_summaries_hdf(
        EEG_MODELING_DIR / "lmer_acz_ranef.h5", [full_model]
    ).query(qstr)[LMER_CHANNELS]
    lmer_acz_ranefs_full.index = lmer_acz_ranefs_full.index.remove_unused_levels()

    # reduced data set from this pipeline_4 after screening Cook's D extreme values
    lmer_acz_ranefs_cooksD_full = read_fg_summaries_hdf(
        lmer_acz_ranef_cooksD_f, [full_model]
    ).query(qstr)[LMER_CHANNELS]

    # ensure the two summaries are commensurable when prerunning
    if PRERUN:
        cooksD_times = lmer_acz_ranefs_cooksD_full.index.unique('Time')
        lmer_acz_ranefs_full = lmer_acz_ranefs_full.query('Time in @cooksD_times')
        lmer_acz_ranefs_full.index = lmer_acz_ranefs_full.index.remove_unused_levels()
 
    assert all(lmer_acz_ranefs_full.index == lmer_acz_ranefs_cooksD_full.index)
    assert all(lmer_acz_ranefs_full.columns == lmer_acz_ranefs_cooksD_full.columns)
    

    # grid of beta difference with minus without Cooks D outliers 
    delta_betas_grid = lmer_acz_ranefs_full.sub(
        lmer_acz_ranefs_cooksD_full
    ).query("key == 'Estimate'").reset_index('key', drop=True)
   

    # SEs from Cook's D subset
    qstr_SE = "model == @full_model and key == 'SE'"
    beta_SEs_grid = lmer_acz_ranefs_cooksD_full.query(qstr_SE)
    std_delta_betas = (delta_betas_grid / beta_SEs_grid)
    

    f, axs = plt.subplots(2, 1, figsize=(12, 10))

    for axi, (beta, std_dfb) in enumerate(std_delta_betas.reset_index('Time').groupby('beta')):
        ax = axs[axi]
        ax.set(ylim=(-3.0, 3.0))
        ax.set_title(beta, loc='center')
        

        std_dfb.plot(x='Time', ax=ax)
        
        # highlight
        ax.axvspan(**n4_highlight)
        
        ax.axhline(y=0, lw=1.0, color='grey')
        ax.axhline(y=-2.0, lw=.75, color='grey')
        ax.axhline(y=2.0, lw=.75, color='grey')
        if axi > 0:
            ax.get_legend().remove()
        else:
            ax.text(-0.1, 1.0, key, transform=ax.transAxes, fontsize='x-large', fontweight='bold')
            ax.legend(loc='upper left', bbox_to_anchor=(1.0, 1.0))
        
        ax.set_ylabel(
            (
                r" $\frac{\hat{\beta}_{j} - \hat{\beta}_{j(trimmed)}}"
                r"{SE(\hat{\beta}_{j(trimmed)}}$"
            ),
            fontsize=24
        )
    fig_tag = f"{FIG_PREFIX} {FIG_COUNT} DFBETA CooksD trimmed"
    f.suptitle(fig_tag, **FIG_TAG_SPECS)
 
    f.subplots_adjust(hspace=.25)
    FIG_COUNT = udck19_figsave(f, fig_tag, FIG_COUNT)
In [19]:
from udck19_utils import plotchans, MPL_32_CHAN, MPL_MIDLINE

plt.close('all')

# figure params
beta_kws = {
    "(Intercept)": {
        'margins': {
            'bottom': 0.15
        }, 
        'axes': {"ylim": (-4, 4)},  # these scale y-extent of the waveforms
        'cal': {
            'ylabel': "$\mu V$",
            'yticks': (-2, 2),
        },
        'chan_label': 'north',
        
        
    },
    
    "article_cloze_z": {
        'margins': {
            'bottom': 0.15
        }, 
        'axes': {"ylim": (-.25, 0.75)},
        'cal': {
            'ylabel': "$\mu V / SD_{article_cloze}$",
            'yticks': (0, .5),
        },
        'chan_label': 'north'

        
    },
}

intervals = [(-1500, 1500), (-200, 600)]

# fetch the different random effects structures for article_cloze_z
models = [
    model 
    for models in lmer_acz_comps.values() 
    for model in models 
    if re.match(r"^article_cloze_z", model)
]
style='seaborn-bright'

# plot 
for x0, x1 in intervals:
    rerps = read_fg_summaries_hdf(
        lmer_acz_ranef_cooksD_f, models
    ).query('Time >= @x0 and Time <= @x1')

    rerps.index = rerps.index.remove_unused_levels()
    rerps.index.unique('model')
    plots = plotchans(rerps, beta_kws, style=style, layout=MPL_32_CHAN, se_ci="CI")
    for plot in plots:
        beta = plot['beta']
        f = plot['fig']
        for ax in f.get_axes():
            ax.axvspan(**n4_highlight)

        suptitle_txt = f._suptitle.get_text()
        x, y = f._suptitle.get_position()
        f.suptitle(
            f"{FIG_PREFIX} {FIG_COUNT}\n{suptitle_txt}",
            x=x,
            y=y,
            fontsize='x-large',
            fontweight='bold'
        )
        FIG_COUNT = udck19_figsave(f, f"{FIG_PREFIX}_{FIG_COUNT}_{beta}_{x0}_{x1}", FIG_COUNT)
In [20]:
# log execution time
pipeline_stop = datetime.datetime.now()

elapsed =  pipeline_stop - pipeline_start
LOGGER.info(f"""
Done {pipeline_stop.strftime("%d.%b %Y %H:%M:%S")}
Elapsed time: {elapsed}
""")
udck19_pipeline_4.ipynb:INFO:
Done 19.Dec 2019 14:44:26
Elapsed time: 1:50:09.645103